In [1]:
# This cell is tagged as `parameters` for `papermill` parameterization.
# Every value defaults to None here; the assignments in the next cell override them.
# input configs
altair_config = None
nipah_config = None
# E2-specific input files
func_scores_E2_file = None
binding_E2_file = None
# E3-specific input files
func_scores_E3_file = None
binding_E3_file = None
# merged / concatenated entry tables
merged_df_file = None
concat_df_file = None
# output plots (each chart is saved to its path by a guarded `.save(...)` call later on)
output_corr = None
entry_binding_corr_plot_E2_output = None
entry_binding_corr_plot_E3_output = None
corr_entry_binding_large_output = None
combined_binding_output = None
entry_by_site_plot_e2_output = None
entry_by_site_plot_e3_output = None
In [2]:
# Parameters
# Concrete values for this run; they replace the None defaults assigned in the cell above.
# All paths are relative, so they depend on the working directory set in a later cell.
altair_config = "data/custom_analyses_data/interactive_theme.py"
nipah_config = "nipah_config.yaml"
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
output_corr = "results/images/corr_heatmap.html"
entry_binding_corr_plot_E2_output = "results/images/entry_binding_corr_plot_E2.html"
entry_binding_corr_plot_E3_output = "results/images/entry_binding_corr_plot_E3.html"
corr_entry_binding_large_output = "results/images/corr_entry_binding_large.html"
combined_binding_output = "results/images/combined_binding.html"
entry_by_site_plot_e2_output = "results/images/entry_by_site_plot_e2.html"
entry_by_site_plot_e3_output = "results/images/entry_by_site_plot_e3.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# Project root that every relative input/output path in this notebook is resolved against.
# NOTE(review): this is a hard-coded absolute path; consider making it a parameter.
_project_dir = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"

# os.getcwd() never returns a trailing slash, so comparing it to the raw string above
# could never succeed; compare normalised paths instead.
if os.path.normpath(os.getcwd()) == os.path.normpath(_project_dir):
    print("Already in correct directory")
else:
    os.chdir(_project_dir)
    print("Setup in correct directory")
Setup in correct directory
Setup input file paths¶
In [5]:
# Fallback for running the notebook without injected parameters: when no config path
# was supplied, fill in the default config and input paths. The output-plot paths
# are not assigned here.
if nipah_config is None:
    #input files
    altair_config = 'data/custom_analyses_data/interactive_theme.py'
    nipah_config = 'nipah_config.yaml'
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
    # only defined in this fallback branch; its one use below is commented out
    antibody_file = 'results/filtered_data/escape/mab_filter_concat.csv'
    merged_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_merged.csv'
    concat_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_concat.csv'
In [6]:
# Apply the shared Altair theme (a Python script) when a path was supplied.
if altair_config:
    with open(altair_config, 'r') as theme_file:
        # NOTE(review): exec runs whatever code the theme file contains in this
        # notebook's namespace -- only point `altair_config` at a trusted file.
        exec(theme_file.read())

# Project-wide settings; later cells read `contact_sites` and `beta_sheet` from here.
with open(nipah_config) as config_file:
    config = yaml.safe_load(config_file)
Import filtered data¶
In [7]:
# Load the pre-filtered entry tables:
#   merged_df - one row per mutation with *_E2 / *_E3 columns (see the preview a few cells below)
#   concat_df - one row per mutation per cell_type
merged_df = pd.read_csv(merged_df_file) #merged entry scores
#ab_df = pd.read_csv(antibody_file)
concat_df = pd.read_csv(concat_df_file)
In [8]:
# Read filtered cell entry data
def read_func_data(file, name):
    """Load a filtered cell-entry CSV and tag every row with a cell type.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.
    name : value written to the added ``cell_type`` column.

    Returns
    -------
    DataFrame with the columns ``site``, ``wildtype``, ``mutant``, ``effect``
    and ``cell_type`` (in that order). A missing column raises ``KeyError``.
    """
    wanted_columns = ['site', 'wildtype', 'mutant', 'effect']
    return pd.read_csv(file)[wanted_columns].assign(cell_type=name)
# Read the entry data for each cell line; the label ends up in the `cell_type` column
# and is what later `.query('cell_type == ...')` calls filter on.
e2_func_df = read_func_data(func_scores_E2_file, 'CHO-EFNB2')
e3_func_df = read_func_data(func_scores_E3_file, 'CHO-EFNB3')
# Read filtered binding data
def read_binding_data(file, name):
    """Load a filtered receptor-binding CSV and tag every row with a cell type.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.
    name : value written to the added ``cell_type`` column.

    Returns
    -------
    DataFrame with the columns ``site``, ``wildtype``, ``mutant``,
    ``binding_mean`` and ``cell_type`` (in that order). A missing column
    raises ``KeyError``.
    """
    wanted_columns = ['site', 'wildtype', 'mutant', 'binding_mean']
    return pd.read_csv(file)[wanted_columns].assign(cell_type=name)
# Read the binding data for each cell line, using the same `cell_type` labels as the
# entry tables so the two can be merged on that column.
e2_bind_df = read_binding_data(binding_E2_file,'CHO-EFNB2')
e3_bind_df = read_binding_data(binding_E3_file,'CHO-EFNB3')
# Concat binding and func data, then merge
def concat_dfs(bind1, bind2, entry1, entry2):
    """Stack the two binding tables and the two entry tables, then join them.

    The binding frames are concatenated row-wise, likewise the entry frames,
    and the two stacks are outer-merged on ``site``, ``wildtype``, ``mutant``
    and ``cell_type``. Because the merge is an outer join, a mutation measured
    in only one assay is kept, with NaN in the other assay's column.
    """
    merge_keys = ['site', 'wildtype', 'mutant', 'cell_type']
    all_binding = pd.concat([bind1, bind2])
    all_entry = pd.concat([entry1, entry2])
    return all_binding.merge(all_entry, on=merge_keys, how='outer')
# Long-format table with binding and entry for both cell types, one row per mutation per cell_type.
final_merged_df = concat_dfs(e2_bind_df,e3_bind_df,e2_func_df,e3_func_df)
In [9]:
### The inputs for plotting are now loaded; preview the first two rows of each.
# Per-cell-line entry tables, then the concatenated and the merged entry tables.
display(e2_func_df.head(2))
display(e3_func_df.head(2))
display(concat_df.head(2))
display(merged_df.head(2))
| site | wildtype | mutant | effect | cell_type | |
|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.750 | CHO-EFNB2 |
| 1 | 71 | Q | D | -1.164 | CHO-EFNB2 |
| site | wildtype | mutant | effect | cell_type | |
|---|---|---|---|---|---|
| 0 | 71 | Q | C | -0.7227 | CHO-EFNB3 |
| 1 | 71 | Q | D | -0.3884 | CHO-EFNB3 |
| site | wildtype | mutant | effect | effect_std | times_seen | n_selections | cell_type | wildtype_site | wt_type | mutant_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.750 | 0.1777 | 4.625 | 8 | CHO-bEFNB2 | Q71 | hydrophilic | special |
| 1 | 71 | Q | D | -1.164 | 0.8890 | 4.500 | 8 | CHO-bEFNB2 | Q71 | hydrophilic | negative |
| site | wildtype | mutant | effect_E2 | effect_std_E2 | times_seen_E2 | n_selections_E2 | cell_type_E2 | wildtype_site_E2 | wt_type_E2 | mutant_type_E2 | effect_E3 | effect_std_E3 | times_seen_E3 | n_selections_E3 | cell_type_E3 | wildtype_site_E3 | wt_type_E3 | mutant_type_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.750 | 0.1777 | 4.625 | 8.0 | CHO-bEFNB2 | Q71 | hydrophilic | special | -0.7227 | 0.7828 | 3.000 | 7.0 | CHO-bEFNB3 | Q71 | hydrophilic | special |
| 1 | 71 | Q | D | -1.164 | 0.8890 | 4.500 | 8.0 | CHO-bEFNB2 | Q71 | hydrophilic | negative | -0.3884 | 0.6369 | 3.429 | 7.0 | CHO-bEFNB3 | Q71 | hydrophilic | negative |
Make heatmap of correlations between entry in CHO-bEFNB2 and CHO-bEFNB3¶
In [10]:
def correlation_heatmap(df):
    """Binned 2-D heatmap comparing entry effects in CHO-bEFNB2 and CHO-bEFNB3.

    Parameters
    ----------
    df : DataFrame with ``effect_E2`` and ``effect_E3`` columns.

    Returns
    -------
    A 400x400 Altair chart whose cells are coloured by the (log-scaled) number
    of mutations falling in each bin, with legend styling applied at the
    top-level configuration.
    """
    x_binned = alt.X(
        "effect_E2",
        title="Entry in CHO-bEFNB2",
        axis=alt.Axis(values=[-4, -3, -2, -1, 0, 1]),
    ).bin(maxbins=75)
    y_binned = alt.Y(
        "effect_E3",
        title="Entry in CHO-bEFNB3",
        axis=alt.Axis(values=[-4, -3, -2, -1, 0, 1]),
    ).bin(maxbins=75)
    # log colour scale so sparsely populated bins stay visible next to dense ones
    count_color = alt.Color('count():Q', title='Count').scale(type='log')

    heatmap_title = alt.Title(
        'Effects of RBP mutations on entry',
        subtitle='Between CHO cells expressing bat EFNB2 or EFNB3',
    )
    heatmap = alt.Chart(df, title=heatmap_title).mark_rect().encode(
        x_binned,
        y_binned,
        count_color,
        tooltip=['count()'],
    )
    sized = heatmap.properties(height=400, width=400)
    return sized.configure_legend(
        padding=2,
        orient='top-left',
        labelFontSize=16,
        titlePadding=2,
        symbolSize=100,
    )
# Build and show the E2-vs-E3 entry heatmap.
corr_heatmap = correlation_heatmap(merged_df)
corr_heatmap.display()
# Guard on the path this chart is actually written to. The previous guard tested
# `entry_by_site_plot_e3_output`, so a run that set only that parameter would have
# called `.save(None)` here.
if output_corr is not None:
    corr_heatmap.save(output_corr)
Make interactive plot linking individual binding and entry effects with top 10 summed binding and entry¶
In [11]:
def plot_entry_binding_interactive(df, name):
    """Scatter of entry vs. binding linked to two "top 10 sites" bar charts.

    Parameters
    ----------
    df : DataFrame with ``site``, ``wildtype``, ``mutant``, ``effect`` and
        ``binding_mean`` columns. It is copied, not modified.
    name : label inserted into the chart title.

    Returns
    -------
    A vertically concatenated Altair chart: the scatter plot on top and,
    underneath, the summed-entry and summed-binding bar charts side by side.
    Dragging a box in the scatter restricts both bar charts to the selected
    points.

    Notes
    -----
    Reads the module-level ``config['contact_sites']`` to flag contact sites.
    """
    plot_df = df.copy()
    plot_df.loc[:, 'is_contact'] = plot_df['site'].isin(config['contact_sites'])

    # interval selection drawn on the scatter; it also filters both bar charts
    brush = alt.selection_interval()

    scatter = (
        alt.Chart(plot_df)
        .mark_point(filled=True, size=50)
        .encode(
            alt.X("effect", title="Cell Entry", axis=alt.Axis(values=[-2, -1, 0, 1])),
            alt.Y("binding_mean", title="Binding", axis=alt.Axis(values=[-4, -2, 0, 2])),
            # selected points are coloured by contact status, the rest greyed out
            color=alt.condition(brush, 'is_contact', alt.value('lightgray')),
            tooltip=["site", "wildtype", "mutant", "binding_mean", "effect"],
        )
        .add_params(brush)
        .properties(width=400, height=400)
    )

    def _top10_bars(value_column, aggregate_name, y_title, **chart_kwargs):
        # Sum `value_column` per site over the brushed points, rank the sums in
        # descending order and keep the sites with rank <= 10.
        return (
            alt.Chart(plot_df, **chart_kwargs)
            .transform_filter(brush)
            .transform_aggregate(
                groupby=['site', 'is_contact'],
                **{aggregate_name: f'sum({value_column})'},
            )
            .transform_window(
                rank=f'rank({aggregate_name})',
                sort=[alt.SortField(aggregate_name, order='descending')],
            )
            .transform_filter(alt.datum.rank <= 10)
            .mark_bar()
            .encode(
                x=alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),
                y=alt.Y(f'{aggregate_name}:Q', title=y_title),
                color=alt.Color('is_contact', title='Receptor Contact Site'),
            )
            .properties(width=200, height=50)
        )

    bars_binding = _top10_bars('binding_mean', 'binding_aggr', 'Binding')
    bars_effect = _top10_bars('effect', 'effect_aggr', 'Entry', title='Top 10')

    # scatter on top; the two bar charts share the row underneath
    layout = scatter & (bars_effect | bars_binding)
    return layout.properties(
        title={
            "text": f"Cell Entry and Binding Analysis for {name}",
            "subtitle": ["Draw box in scatterplot to show the top 10 sites by",
                         "summed binding and cell entry"],
            "color": "black",
            "subtitleColor": "gray"
        }
    )
# Interactive entry-vs-binding figure for the bEFNB2 rows.
entry_binding_corr_plot_E2 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-EFNB2"'),'bEFNB2')
entry_binding_corr_plot_E2.display()
# Guard on the path being written (the previous guard tested an unrelated output parameter).
if entry_binding_corr_plot_E2_output is not None:
    entry_binding_corr_plot_E2.save(entry_binding_corr_plot_E2_output)
Now do the same as above for EFNB3¶
In [12]:
# Interactive entry-vs-binding figure for the bEFNB3 rows.
entry_binding_corr_plot_E3 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-EFNB3"'),'bEFNB3')
entry_binding_corr_plot_E3.display()
# Guard on the path being written (the previous guard tested `entry_by_site_plot_e3_output`).
if entry_binding_corr_plot_E3_output is not None:
    entry_binding_corr_plot_E3.save(entry_binding_corr_plot_E3_output)
Make plot based on region¶
In [13]:
def find_domain(df):
    """Annotate each mutation with the RBP region its site belongs to.

    Parameters
    ----------
    df : DataFrame with ``site``, ``binding_mean``, ``effect``, ``mutant``,
        ``wildtype`` and ``cell_type`` columns.

    Returns
    -------
    A new DataFrame with the columns ``region``, ``binding_mean``, ``effect``,
    ``site``, ``mutant``, ``wildtype``, ``cell_type`` and a fresh 0..n-1 index.
    Rows are grouped by region in the order Stalk, Neck, Linker, Head and keep
    their input order within a region. Rows whose site lies outside every
    range are dropped. An empty result still carries the columns above.
    """
    # half-open site ranges for each region
    region_ranges = {
        "Stalk": range(70, 148),
        "Neck": range(148, 166),
        "Linker": range(166, 178),
        "Head": range(178, 602),
    }
    output_columns = ["region", "binding_mean", "effect", "site", "mutant", "wildtype", "cell_type"]

    # One vectorised lookup replaces the per-row Python loop over the frame.
    site_to_region = {
        site: region for region, sites in region_ranges.items() for site in sites
    }
    region_rank = {region: rank for rank, region in enumerate(region_ranges)}

    region = df["site"].map(site_to_region)
    in_any_region = region.notna().to_numpy()

    annotated = df.loc[in_any_region, output_columns[1:]].assign(
        region=region.to_numpy()[in_any_region]
    )
    # A stable sort keeps the input order inside each region, matching the
    # region-by-region ordering of the output.
    annotated = annotated.assign(
        _region_rank=annotated["region"].map(region_rank)
    ).sort_values("_region_rank", kind="stable")
    return annotated[output_columns].reset_index(drop=True)
# Step 1: annotate every mutation with its RBP region.
binding_entry_by_domain_df = find_domain(final_merged_df)
display(binding_entry_by_domain_df)

# Step 2: reshape from long (one row per mutation per cell type) to wide
# (one row per mutation, one column per measurement per cell type).
df_pivot = (
    binding_entry_by_domain_df
    .pivot_table(
        index=['region', 'site', 'wildtype', 'mutant'],
        columns='cell_type',
        values=['effect', 'binding_mean'],
        aggfunc='first',
    )
    .reset_index()
)

# Step 3: flatten the two-level column index. Index columns have an empty second
# level and keep their plain name; value columns become "<measure>_<cell_type>".
flat_names = []
for measure, cell_type_label in df_pivot.columns.values:
    if cell_type_label:
        flat_names.append('_'.join((measure, cell_type_label)).strip())
    else:
        flat_names.append(measure)
df_pivot.columns = flat_names

# Step 4: switch to the short E2/E3 names used by the plotting functions.
df_pivot = df_pivot.rename(columns={
    'effect_CHO-EFNB2': 'effect_E2',
    'effect_CHO-EFNB3': 'effect_E3',
    'binding_mean_CHO-EFNB2': 'binding_E2',
    'binding_mean_CHO-EFNB3': 'binding_E3'
})
display(df_pivot)
| region | binding_mean | effect | site | mutant | wildtype | cell_type | |
|---|---|---|---|---|---|---|---|
| 0 | Stalk | -0.78170 | -1.16400 | 71 | D | Q | CHO-EFNB2 |
| 1 | Stalk | 0.16590 | -1.25500 | 71 | E | Q | CHO-EFNB2 |
| 2 | Stalk | -0.34290 | -1.05800 | 71 | F | Q | CHO-EFNB2 |
| 3 | Stalk | 0.46570 | -1.42500 | 71 | G | Q | CHO-EFNB2 |
| 4 | Stalk | 0.02003 | -0.37640 | 71 | H | Q | CHO-EFNB2 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 19456 | Head | NaN | -1.66700 | 601 | F | C | CHO-EFNB3 |
| 19457 | Head | NaN | -2.04700 | 601 | G | C | CHO-EFNB3 |
| 19458 | Head | NaN | -0.75770 | 601 | I | C | CHO-EFNB3 |
| 19459 | Head | NaN | -1.52300 | 601 | P | C | CHO-EFNB3 |
| 19460 | Head | NaN | 0.01403 | 601 | V | C | CHO-EFNB3 |
19461 rows × 7 columns
| region | site | wildtype | mutant | binding_E2 | binding_E3 | effect_E2 | effect_E3 | |
|---|---|---|---|---|---|---|---|---|
| 0 | Head | 178 | V | A | 0.7066 | 0.008861 | -0.2181 | 0.01306 |
| 1 | Head | 178 | V | C | 0.1814 | 0.451400 | 0.1203 | 0.47640 |
| 2 | Head | 178 | V | D | NaN | -0.041930 | -1.9200 | -1.03800 |
| 3 | Head | 178 | V | E | NaN | 0.142800 | -1.7900 | -0.41900 |
| 4 | Head | 178 | V | F | 0.5869 | 0.039550 | -0.7901 | -0.34260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9915 | Stalk | 147 | K | S | 0.1344 | -0.060950 | 0.1857 | 0.13650 |
| 9916 | Stalk | 147 | K | T | 1.0700 | -0.052750 | -0.3402 | -0.79560 |
| 9917 | Stalk | 147 | K | V | NaN | 0.086850 | -1.9730 | -1.02500 |
| 9918 | Stalk | 147 | K | W | NaN | NaN | -2.9010 | -2.27500 |
| 9919 | Stalk | 147 | K | Y | NaN | 0.146400 | -2.9410 | -1.39500 |
9920 rows × 8 columns
In [14]:
def correlation_plot(df):
    """Side-by-side E2-vs-E3 scatter plots for entry and binding.

    Parameters
    ----------
    df : wide DataFrame with ``region``, ``site``, ``wildtype``, ``mutant``,
        ``effect_E2``, ``effect_E3``, ``binding_E2`` and ``binding_E3``.

    Returns
    -------
    An Altair horizontal concatenation (entry | binding). A radio-button
    selection on ``region`` highlights the chosen region in both panels;
    the extra "All" button selects nothing in particular and shows everything.
    """
    regions = ['Stalk', 'Neck', 'Linker', 'Head']
    button_labels = [f'{region} ' for region in regions] + ['All']

    region_radio = alt.binding_radio(
        # None is the empty selection, labelled "All"
        options=regions + [None],
        labels=button_labels,
        name='Region: '
    )
    selection = alt.selection_point(fields=['region'], bind=region_radio)

    # selected region: coloured and opaque; everything else: grey and half transparent
    color = alt.condition(
        selection,
        alt.Color('region:N', scale=alt.Scale(domain=regions)),
        alt.value('lightgray'),
    )
    opacity = alt.condition(selection, alt.value(1), alt.value(0.5))

    def _scatter(chart_title, x_field, x_title, y_field, y_title, tooltip_fields):
        points = alt.Chart(df, title=chart_title).mark_point(size=30, opacity=1, filled=True)
        encoded = points.encode(
            alt.X(x_field, title=x_title, axis=alt.Axis(tickCount=4)),
            alt.Y(y_field, title=y_title, axis=alt.Axis(tickCount=4)),
            tooltip=tooltip_fields,
            color=color,
            opacity=opacity,
        )
        return encoded.properties(height=400, width=400)

    effect_chart = _scatter(
        'Cell entry',
        "effect_E2", "Entry in CHO-bEFNB2",
        "effect_E3", "Entry in CHO-bEFNB3",
        ["wildtype", "site", "mutant"],
    )
    binding_chart = _scatter(
        'Receptor binding',
        "binding_E2", "bEFNB2 Binding",
        "binding_E3", "bEFNB3 Binding",
        ["site", "mutant"],
    )

    return (effect_chart | binding_chart).add_params(selection)
# E2-vs-E3 correlation panels with the region radio buttons.
corr_entry_binding_large = correlation_plot(df_pivot)
corr_entry_binding_large.display()
# Guard on the path being written (the previous guard tested an unrelated output parameter).
if corr_entry_binding_large_output is not None:
    corr_entry_binding_large.save(corr_entry_binding_large_output)
In [15]:
def make_custom_figure(df, name):
    """Jittered strip plot of binding per RBP region with a linked top-10 bar chart.

    Parameters
    ----------
    df : DataFrame with ``region``, ``binding_mean``, ``site`` and ``mutant``.
    name : text used as the strip plot's title.

    Returns
    -------
    An Altair vertical concatenation: the strip plot on top and, below it, a
    bar chart of the ten sites with the largest summed ``binding_mean`` among
    the points inside the brushed box.
    """
    brush = alt.selection_interval()
    region_order = ["Stalk", "Neck", "Linker", "Head"]

    strip = (
        alt.Chart(df, title=alt.Title(f'{name}'))
        .mark_point(opacity=0.3, filled=True)
        .encode(
            alt.X("binding_mean", title="Binding", axis=alt.Axis(tickCount=4)),
            alt.Y("region:O", sort=region_order, title="RBP Region"),
            # vertical jitter inside each region band, from the calculated field below
            yOffset="random:Q",
            tooltip=["region", "binding_mean", "site", "mutant"],
            color=alt.condition(brush, 'region', alt.value('lightgray')),
        )
        .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
        .add_params(brush)
        .properties(height=200, width=400)
    )

    top_sites = (
        alt.Chart(df)
        .transform_filter(brush)
        .transform_aggregate(
            binding_aggr='sum(binding_mean)',
            groupby=['site', 'region'],
        )
        .transform_window(
            rank='rank(binding_aggr)',
            sort=[alt.SortField('binding_aggr', order='descending')],
        )
        .transform_filter(alt.datum.rank <= 10)
        .mark_bar()
        .encode(
            y=alt.Y('binding_aggr:Q', title='Binding'),
            x=alt.X('site:N', sort='-y', title='Site'),
            color=alt.Color('region', title='Region'),
        )
        .properties(height=50, width=400)
    )

    return strip & top_sites
In [16]:
# Binding-by-region figure restricted to the CHO-EFNB2 rows; it is combined with the
# EFNB3 figure before saving, so the standalone save stays commented out.
efnb2_binding_region = make_custom_figure(binding_entry_by_domain_df.query('cell_type == "CHO-EFNB2"'),'bEFNB2')
efnb2_binding_region.display()
#efnb2_binding_region.save('results/images/efnb2_binding_region.html')
In [17]:
# Binding-by-region figure restricted to the CHO-EFNB3 rows; it is combined with the
# EFNB2 figure before saving, so the standalone save stays commented out.
efnb3_binding_region = make_custom_figure(binding_entry_by_domain_df.query('cell_type == "CHO-EFNB3"'),'bEFNB3')
efnb3_binding_region.display()
#efnb3_binding_region.save('results/images/efnb3_binding_region.html')
In [18]:
# Place the bEFNB2 and bEFNB3 binding-by-region figures side by side under one title.
combined_binding = (efnb2_binding_region | efnb3_binding_region).properties(title=alt.Title('Receptor binding by RBP mutant',subtitle='Draw boxes around scatter plots to see top sites'))
combined_binding.display()
# Guard on the path being written (the previous guard tested an unrelated output parameter).
if combined_binding_output is not None:
    combined_binding.save(combined_binding_output)
In [19]:
def entry_by_site(df):
    """Bar chart of mean entry effect per site, one row of bars per cell type.

    NOTE(review): a later cell defines another ``entry_by_site`` taking
    ``(df, name, effect)``; once that cell has run, this one-argument version
    is shadowed, so the two cells must be executed in notebook order.

    Parameters
    ----------
    df : DataFrame with ``site``, ``cell_type`` and ``effect`` columns.

    Returns
    -------
    An Altair vertical concatenation: a thin strip of gray marks at sites
    listed in ``config['beta_sheet']``, above a per-site bar chart of the mean
    ``effect`` (rounded to 3 decimals) coloured by region and faceted into
    rows by ``cell_type``. Hovering highlights the nearest site.
    """
    site_means = df.groupby(['site', 'cell_type'])['effect'].mean().reset_index()

    # half-open site ranges for each RBP region
    region_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 602)),
    }
    legend_order = ["Stalk", "Neck", "Linker", "Head"]

    # one record per (site, cell_type), tagged with the region containing the site
    records = [
        {
            "region": region,
            "effect": row["effect"],
            "site": row["site"],
            "cell_type": row["cell_type"],
        }
        for region, sites in region_ranges.items()
        for _, row in site_means[site_means["site"].isin(sites)].iterrows()
    ]
    agg_means_df = pd.DataFrame(records).round(3)
    # flag the sites that the config lists as beta-sheet positions
    agg_means_df['beta_sheet'] = agg_means_df['site'].isin(config['beta_sheet'])

    # hover selection keyed on site
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )

    site_axis = alt.Axis(
        labelAngle=-90,
        values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],
        tickCount=11,
        grid=True,
    )
    bars = (
        alt.Chart(agg_means_df)
        .mark_bar(opacity=1, stroke='black')
        .encode(
            alt.X("site:N", title='Site', axis=site_axis),
            alt.Y("effect", title="Mean entry"),
            tooltip=["site", "effect", "region"],
            color=alt.Color('region', sort=legend_order, title='Region'),
            # outline only the hovered bar
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            row=alt.Row('cell_type', title=None, header=alt.Header(labelFontSize=15, labelFontWeight='bold')),
        )
        .properties(width=800, height=150)
    )

    # thin strip above the bars marking the beta-sheet sites
    beta_strip = (
        alt.Chart(agg_means_df)
        .mark_rect(color='gray')
        .encode(
            alt.X('site:N', axis=None),
            alt.Y('beta_sheet', axis=None),
            tooltip=['site', 'beta_sheet'],
        )
        .transform_filter('datum.beta_sheet == true')
        .properties(width=800, height=10)
    )

    stacked = alt.vconcat(beta_strip, bars, padding=0).resolve_scale(y='independent', x='shared')
    figure_title = alt.Title(
        'Cell entry by RBP mutations',
        subtitle=['Hover mouse over bars to view information about cell entry', 'Gray bars are beta sheets'],
    )
    return stacked.properties(title=figure_title).add_params(variant_selector)
# Uses the one-argument `entry_by_site` defined just above. The next cell redefines
# `entry_by_site` with a different signature, so this call only works when cells are
# run in notebook order.
entry_by_site_plot = entry_by_site(concat_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')
Make interactive chart for individual mutations¶
In [20]:
def entry_by_site(df, name, effect):
    """Per-site mean-entry bar chart linked to a per-mutation heatmap strip.

    NOTE(review): this reuses the name of the one-argument ``entry_by_site``
    defined in an earlier cell and replaces it once this cell runs.

    Parameters
    ----------
    df : DataFrame with ``site``, ``mutant``, ``wildtype``, ``region`` and the
        column named by ``effect``.
    name : label inserted into the chart title.
    effect : name of the column holding the entry effect to plot.

    Returns
    -------
    An Altair vertical concatenation: the bar chart of ``mean(effect)`` per
    site on top and, below it, a one-row heatmap of the hovered site's
    mutations with an "X" at the wildtype residue.
    """
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]

    # Scaffold holding every (site, amino acid) pair, left-joined with the data
    # so unmeasured mutations still get a row.
    all_mutations = pd.DataFrame(
        [{"site": site, "mutant": aa} for site in range(71, 603) for aa in amino_acid_order]
    )
    full_df = all_mutations.merge(df, on=['site', 'mutant'], how='left')

    # hover selection keyed on site
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )

    site_axis = alt.Axis(
        values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],
        tickCount=11,
        grid=True,
    )
    site_bars = (
        alt.Chart(full_df)
        .add_params(variant_selector)
        .mark_bar(opacity=1, stroke='black')
        .encode(
            alt.X("site:N", title='Site', axis=site_axis),
            alt.Y(f"mean({effect})", title="Mean entry"),
            tooltip=["site", "wildtype", "region"],
            opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.7)),
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            color=alt.Color('region', title='Region'),
        )
        .properties(width=800, height=200)
    )

    # one-row heatmap of every mutation at the hovered site
    heat_legend = alt.Legend(
        orient='right', direction='horizontal', titleAlign='center', titleAnchor='middle'
    )
    mutation_strip = (
        alt.Chart(full_df)
        .mark_bar(stroke='black')
        .encode(
            alt.X('mutant:N', title=None, scale=alt.Scale(domain=amino_acid_order)),
            color=alt.Color(
                f'{effect}',
                legend=heat_legend,
                title='Cell entry',
                scale=alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2]),
            ),
        )
        .transform_filter(variant_selector)
        .properties(width=400, height=10)
    )

    # "X" at the wildtype residue of the hovered site
    wildtype_marker = (
        alt.Chart(full_df)
        .mark_text(color="black", text="X", size=10, align="center", baseline="middle")
        .encode(alt.X('wildtype:N', title='Amino acid'))
        .transform_filter(variant_selector)
        # `!= None` is deliberate: it builds an Altair expression that drops
        # rows without data, so it must not be rewritten as `is not None`
        .transform_filter(alt.datum[effect] != None)
        .properties(width=400, height=10)
    )

    strip_with_wildtype = alt.layer(wildtype_marker, mutation_strip).resolve_scale(x='shared')

    stacked = alt.vconcat(site_bars, strip_with_wildtype).resolve_scale(y='independent', x='independent')
    figure_title = alt.Title(
        f'Entry in {name}',
        subtitle=['Hover over bars to view information about specific mutations'],
    )
    return stacked.properties(title=figure_title)
In [21]:
# Build the per-site entry chart for CHO-bEFNB2 from the wide table.
entry_by_site_plot_e2 = entry_by_site(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e2.display()
# Guard on the E2 path being written (the previous guard tested the E3 output parameter).
if entry_by_site_plot_e2_output is not None:
    entry_by_site_plot_e2.save(entry_by_site_plot_e2_output)
In [22]:
# Build the per-site entry chart for CHO-bEFNB3 from the wide table,
# and save it only when an output path was supplied.
entry_by_site_plot_e3 = entry_by_site(df_pivot,'CHO-bEFNB3','effect_E3')
entry_by_site_plot_e3.display()
if entry_by_site_plot_e3_output is not None:
    entry_by_site_plot_e3.save(entry_by_site_plot_e3_output)
In [23]:
# Stack the E2 and E3 per-site entry charts vertically (display only; not saved).
combined_entry_by_site = (entry_by_site_plot_e2 & entry_by_site_plot_e3)
combined_entry_by_site.display()
Testing stuff below¶
In [24]:
def make_effect_by_site_with_hover_tooltip(df):
    """Line chart of mean entry per site with a hover rule and value labels.

    Parameters
    ----------
    df : DataFrame with ``cell_type``, ``site`` and ``effect`` columns.

    Returns
    -------
    A layered 800x200 Altair chart with one smoothed line per cell type.
    Hovering picks the nearest site, draws a vertical rule there and shows
    the mean effect (rounded to 2 decimals) next to each line.
    """
    site_means = df.groupby(['cell_type', 'site'])['effect'].mean().reset_index().round(2)

    # nearest-point selection on the site value under the cursor
    nearest = alt.selection_point(
        nearest=True, on='mouseover', fields=['site'], empty=False
    )

    site_ticks = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600]
    line = (
        alt.Chart(site_means)
        .mark_line(interpolate='basis', size=1)
        .encode(
            alt.X('site:Q', title='Site', axis=alt.Axis(values=site_ticks)),
            alt.Y('effect:Q', title='Mean entry'),
            color=alt.Color('cell_type:N', title='Cell type'),
        )
    )

    # invisible points spanning the chart; they carry the hover selection
    selectors = (
        alt.Chart(site_means)
        .mark_point()
        .encode(alt.X('site:Q'), opacity=alt.value(0))
        .add_params(nearest)
    )

    # point markers on the lines, visible only at the hovered site
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )

    # value labels next to the hovered points, blank elsewhere
    text = line.mark_text(align='left', dx=5, dy=-5, fontSize=15).encode(
        text=alt.condition(nearest, 'effect:Q', alt.value(' ')),
    )

    # vertical rule at the hovered site
    rules = (
        alt.Chart(site_means)
        .mark_rule(color='gray')
        .encode(x='site:Q')
        .transform_filter(nearest)
    )

    layered = alt.layer(line, selectors, points, rules, text)
    return layered.properties(width=800, height=200)
# Exploratory: hover line chart of mean entry per site for both cell types (display only).
alt_plot = make_effect_by_site_with_hover_tooltip(binding_entry_by_domain_df)
alt_plot.display()
In [ ]:
In [25]:
# Re-inspect the wide per-mutation table before aggregating it by site below.
display(df_pivot)
| region | site | wildtype | mutant | binding_E2 | binding_E3 | effect_E2 | effect_E3 | |
|---|---|---|---|---|---|---|---|---|
| 0 | Head | 178 | V | A | 0.7066 | 0.008861 | -0.2181 | 0.01306 |
| 1 | Head | 178 | V | C | 0.1814 | 0.451400 | 0.1203 | 0.47640 |
| 2 | Head | 178 | V | D | NaN | -0.041930 | -1.9200 | -1.03800 |
| 3 | Head | 178 | V | E | NaN | 0.142800 | -1.7900 | -0.41900 |
| 4 | Head | 178 | V | F | 0.5869 | 0.039550 | -0.7901 | -0.34260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9915 | Stalk | 147 | K | S | 0.1344 | -0.060950 | 0.1857 | 0.13650 |
| 9916 | Stalk | 147 | K | T | 1.0700 | -0.052750 | -0.3402 | -0.79560 |
| 9917 | Stalk | 147 | K | V | NaN | 0.086850 | -1.9730 | -1.02500 |
| 9918 | Stalk | 147 | K | W | NaN | NaN | -2.9010 | -2.27500 |
| 9919 | Stalk | 147 | K | Y | NaN | 0.146400 | -2.9410 | -1.39500 |
9920 rows × 8 columns
In [26]:
# Per-site summary: mean entry effect in each cell line (rounded to 2 decimals),
# plus the site's region and wildtype residue.
aggregated_df = (
    df_pivot
    .groupby(['site'])
    .agg({
        'effect_E2': 'mean',
        'effect_E3': 'mean',
        'region': 'first',
        'wildtype': 'first',
    })
    .reset_index()
    .round(2)
)

# Long format with one row per (site, track). `region` is melted alongside the two
# effect columns, so the `effect` column deliberately mixes numbers and region names.
melted_df = aggregated_df.melt(
    id_vars=['site', 'wildtype'],
    value_vars=['effect_E2', 'effect_E3', 'region'],
    var_name='selection',
    value_name='effect',
)
display(melted_df)
| site | wildtype | selection | effect | |
|---|---|---|---|---|
| 0 | 71 | Q | effect_E2 | -1.18 |
| 1 | 72 | N | effect_E2 | -1.23 |
| 2 | 73 | Y | effect_E2 | -0.74 |
| 3 | 74 | T | effect_E2 | -0.68 |
| 4 | 75 | R | effect_E2 | -0.73 |
| ... | ... | ... | ... | ... |
| 1588 | 597 | I | region | Head |
| 1589 | 598 | P | region | Head |
| 1590 | 599 | E | region | Head |
| 1591 | 600 | Q | region | Head |
| 1592 | 601 | C | region | Head |
1593 rows × 4 columns
In [27]:
# Split sites 71-601 into consecutive chunks of 65 sites (the last chunk is shorter);
# each chunk becomes one row of the tiled heatmap built further down.
full_ranges = [
    list(range(*bounds))
    for bounds in (
        (71, 136),
        (136, 201),
        (201, 266),
        (266, 331),
        (331, 396),
        (396, 461),
        (461, 526),
        (526, 591),
        (591, 602),
    )
]
print(full_ranges)
[[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135], [136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200], [201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265], [266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330], [331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395], [396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460], [461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 
474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525], [526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590], [591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601]]
In [28]:
# One row per receptor-contact site, shaped like `melted_df` (site / effect / selection)
# so it can be drawn as an extra track. The effect value 0.0 is only a placeholder.
df = pd.DataFrame(
    {
        "site": config["contact_sites"],
        "effect": [0.0] * len(config["contact_sites"]),
    }
)
df["selection"] = "receptor contact"

# Drop contact rows added by an earlier run of this cell before prepending the new
# ones, so re-running the cell does not keep growing `melted_df`.
melted_df = pd.concat(
    [df, melted_df[melted_df["selection"] != "receptor contact"]]
)
display(melted_df)
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 0 | 239 | 0.0 | receptor contact | NaN |
| 1 | 240 | 0.0 | receptor contact | NaN |
| 2 | 241 | 0.0 | receptor contact | NaN |
| 3 | 242 | 0.0 | receptor contact | NaN |
| 4 | 305 | 0.0 | receptor contact | NaN |
| ... | ... | ... | ... | ... |
| 1588 | 597 | Head | region | I |
| 1589 | 598 | Head | region | P |
| 1590 | 599 | Head | region | E |
| 1591 | 600 | Head | region | Q |
| 1592 | 601 | Head | region | C |
1626 rows × 4 columns
In [29]:
# Build one heatmap strip per chunk of sites and stack the strips vertically.
empty_chart = []
# Row order of the tracks. The label must match the `selection` value used in the
# data ("receptor contact", with a space); the previous "receptor_contact" never
# matched, so the contact track was not placed first.
to_sort = ['receptor contact', 'effect_E2', 'effect_E3']
for idx, site_subset in enumerate(full_ranges):
    tmp_df = melted_df[melted_df['site'].isin(site_subset)]
    display(tmp_df)

    # only the bottom strip gets the axis title and the colour legend
    is_last_plot = idx == len(full_ranges) - 1
    x_axis = alt.Axis(
        labelAngle=-90,
        # label every tenth site only
        labelExpr="datum.value % 10 === 0 ? datum.value : ''",
        title="Site" if is_last_plot else None,
        labels=True,
    )
    effect_legend = (
        alt.Legend(
            direction="horizontal",
            gradientLength=150,
            titleAnchor="middle",
            tickCount=3,
            labelAlign="center",
        )
        if is_last_plot
        else None
    )

    # shared encodings: one 10px cell per site (x) and per track (y)
    base = alt.Chart(tmp_df).encode(
        alt.X("site:O", title="Site", axis=x_axis),
        alt.Y('selection:N', title=None, sort=to_sort),
        tooltip=['site', 'selection', 'effect']
    ).properties(width=alt.Step(10), height=alt.Step(10))

    # mean entry effect tracks on a diverging colour scale centred at 0
    entry = base.mark_rect(stroke='black', strokeWidth=0.5).encode(
        alt.Color('effect', legend=effect_legend)
        .scale(scheme='redblue', domainMid=0, domain=[-4, 2])
    ).transform_filter(
        (alt.datum.selection == "effect_E2") | (alt.datum.selection == "effect_E3")
    )

    # region track: here the `effect` column holds the region name
    region = base.mark_rect(stroke='black', strokeWidth=0.5).encode(
        alt.Color('effect', legend=effect_legend)
    ).transform_filter(alt.datum.selection == 'region')

    # receptor-contact track: solid black cells
    contact = base.mark_rect(color='black', stroke='black', strokeWidth=0.5).encode(
    ).transform_filter(
        (alt.datum.selection == "receptor contact")
    )

    tmp_chart = alt.layer(region, entry, contact)
    empty_chart.append(tmp_chart)

test = alt.vconcat(*empty_chart, spacing=0)
test
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 0 | 71 | -1.18 | effect_E2 | Q |
| 1 | 72 | -1.23 | effect_E2 | N |
| 2 | 73 | -0.74 | effect_E2 | Y |
| 3 | 74 | -0.68 | effect_E2 | T |
| 4 | 75 | -0.73 | effect_E2 | R |
| ... | ... | ... | ... | ... |
| 1122 | 131 | Stalk | region | I |
| 1123 | 132 | Stalk | region | S |
| 1124 | 133 | Stalk | region | Q |
| 1125 | 134 | Stalk | region | S |
| 1126 | 135 | Stalk | region | T |
195 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 65 | 136 | -0.6 | effect_E2 | A |
| 66 | 137 | -0.41 | effect_E2 | S |
| 67 | 138 | -2.86 | effect_E2 | I |
| 68 | 139 | -0.96 | effect_E2 | N |
| 69 | 140 | 0.07 | effect_E2 | E |
| ... | ... | ... | ... | ... |
| 1187 | 196 | Head | region | Q |
| 1188 | 197 | Head | region | I |
| 1189 | 198 | Head | region | L |
| 1190 | 199 | Head | region | K |
| 1191 | 200 | Head | region | P |
195 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 0 | 239 | 0.0 | receptor contact | NaN |
| 1 | 240 | 0.0 | receptor contact | NaN |
| 2 | 241 | 0.0 | receptor contact | NaN |
| 3 | 242 | 0.0 | receptor contact | NaN |
| 130 | 201 | -1.32 | effect_E2 | K |
| ... | ... | ... | ... | ... |
| 1252 | 261 | Head | region | E |
| 1253 | 262 | Head | region | V |
| 1254 | 263 | Head | region | P |
| 1255 | 264 | Head | region | S |
| 1256 | 265 | Head | region | L |
199 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 4 | 305 | 0.0 | receptor contact | NaN |
| 195 | 266 | -2.89 | effect_E2 | F |
| 196 | 267 | -1.96 | effect_E2 | M |
| 197 | 268 | -0.9 | effect_E2 | T |
| 198 | 269 | -0.99 | effect_E2 | N |
| ... | ... | ... | ... | ... |
| 1317 | 326 | Head | region | N |
| 1318 | 327 | Head | region | G |
| 1319 | 328 | Head | region | G |
| 1320 | 329 | Head | region | G |
| 1321 | 330 | Head | region | Y |
196 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 5 | 388 | 0.0 | receptor contact | NaN |
| 6 | 389 | 0.0 | receptor contact | NaN |
| 260 | 331 | -1.13 | effect_E2 | N |
| 261 | 332 | -0.08 | effect_E2 | Q |
| 262 | 333 | -0.18 | effect_E2 | H |
| ... | ... | ... | ... | ... |
| 1382 | 391 | Head | region | K |
| 1383 | 392 | Head | region | P |
| 1384 | 393 | Head | region | E |
| 1385 | 394 | Head | region | N |
| 1386 | 395 | Head | region | C |
197 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 7 | 401 | 0.0 | receptor contact | NaN |
| 8 | 402 | 0.0 | receptor contact | NaN |
| 9 | 458 | 0.0 | receptor contact | NaN |
| 325 | 396 | -0.56 | effect_E2 | R |
| 326 | 397 | -0.31 | effect_E2 | L |
| ... | ... | ... | ... | ... |
| 1447 | 456 | Head | region | A |
| 1448 | 457 | Head | region | S |
| 1449 | 458 | Head | region | F |
| 1450 | 459 | Head | region | S |
| 1451 | 460 | Head | region | W |
198 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 10 | 488 | 0.0 | receptor contact | NaN |
| 11 | 489 | 0.0 | receptor contact | NaN |
| 12 | 490 | 0.0 | receptor contact | NaN |
| 13 | 491 | 0.0 | receptor contact | NaN |
| 14 | 492 | 0.0 | receptor contact | NaN |
| ... | ... | ... | ... | ... |
| 1512 | 521 | Head | region | S |
| 1513 | 522 | Head | region | A |
| 1514 | 523 | Head | region | G |
| 1515 | 524 | Head | region | V |
| 1516 | 525 | Head | region | F |
205 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 20 | 530 | 0.0 | receptor contact | NaN |
| 21 | 531 | 0.0 | receptor contact | NaN |
| 22 | 532 | 0.0 | receptor contact | NaN |
| 23 | 533 | 0.0 | receptor contact | NaN |
| 24 | 555 | 0.0 | receptor contact | NaN |
| ... | ... | ... | ... | ... |
| 1577 | 586 | Head | region | N |
| 1578 | 587 | Head | region | V |
| 1579 | 588 | Head | region | I |
| 1580 | 589 | Head | region | R |
| 1581 | 590 | Head | region | P |
208 rows × 4 columns
| site | effect | selection | wildtype | |
|---|---|---|---|---|
| 520 | 591 | -0.49 | effect_E2 | K |
| 521 | 592 | -2.07 | effect_E2 | L |
| 522 | 593 | -1.86 | effect_E2 | F |
| 523 | 594 | -2.54 | effect_E2 | A |
| 524 | 595 | -1.22 | effect_E2 | V |
| 525 | 596 | -0.0 | effect_E2 | K |
| 526 | 597 | -2.19 | effect_E2 | I |
| 527 | 598 | -1.04 | effect_E2 | P |
| 528 | 599 | 0.01 | effect_E2 | E |
| 529 | 600 | 0.07 | effect_E2 | Q |
| 530 | 601 | -1.06 | effect_E2 | C |
| 1051 | 591 | -1.46 | effect_E3 | K |
| 1052 | 592 | -1.72 | effect_E3 | L |
| 1053 | 593 | -1.93 | effect_E3 | F |
| 1054 | 594 | -2.25 | effect_E3 | A |
| 1055 | 595 | -0.53 | effect_E3 | V |
| 1056 | 596 | -0.0 | effect_E3 | K |
| 1057 | 597 | -2.06 | effect_E3 | I |
| 1058 | 598 | -0.82 | effect_E3 | P |
| 1059 | 599 | 0.12 | effect_E3 | E |
| 1060 | 600 | 0.18 | effect_E3 | Q |
| 1061 | 601 | -0.77 | effect_E3 | C |
| 1582 | 591 | Head | region | K |
| 1583 | 592 | Head | region | L |
| 1584 | 593 | Head | region | F |
| 1585 | 594 | Head | region | A |
| 1586 | 595 | Head | region | V |
| 1587 | 596 | Head | region | K |
| 1588 | 597 | Head | region | I |
| 1589 | 598 | Head | region | P |
| 1590 | 599 | Head | region | E |
| 1591 | 600 | Head | region | Q |
| 1592 | 601 | Head | region | C |
Out[29]:
In [ ]: